R/significance tests.R

Defines functions test_non_parametric perform_aov summary_stats summarise_wins save_flextable_docx plot_histogram plot_qq plot_residuals

# plot residuals over predictions (y_hat) or predictor (x)
plot_residuals <- function(data, x_variable) {
  ggplot(data, aes(y = residuals, x = !!rlang::sym(x_variable))) +
    geom_point() +
    geom_hline(yintercept = 0, linewidth = 0.5) +
    theme_minimal()
}

# plot Q-Q normal plot
plot_qq <- function(model_lm) {
  ggplot(model_lm) + 
    gglm::stat_normal_qq(alpha = 0.3) +
    theme_minimal()
}

# plot histogram from vector
plot_histogram <- function(vector, xlabel = "residuals", bins = 8, ymax = 12.5) {
  ggplot() + 
    geom_histogram(aes(x = vector), bins = bins, fill = "lightblue") +
    labs(x = xlabel) +
    lims(y = c(0, ymax)) +
    theme_minimal()
}

# save dataframe as doxc
save_flextable_docx <- function(data, digits = 3, filename) {
  data %>% 
    flextable::flextable() %>%
    flextable::colformat_double(digits = digits) %>%
    flextable::save_as_docx(path = filename)
}

# create grouped data for group comparisons
summarise_wins <- function(result_data, max_wins, grouping) {

  result_data %>%
    group_by(across(all_of(grouping))) %>%
    dplyr::summarise(
      mean = mean(wins / max_wins),
      median = median(wins / max_wins),
      sd = sd(wins / max_wins),
      se = sd / sqrt(n()),
      n = n()
    )
}

summary_stats <- function(
  data_set, grouping_labels, dv_label, remove_data = TRUE) {

  dv <- rlang::sym(dv_label)

  data_set %>%
    nest(data = !any_of(grouping_labels)) %>%
    dplyr::mutate(
      ci = map(data, ~ DescTools::MeanCI(.x[[dv]])),
      var = map(data, ~ var(.x[[dv]])),
      sd = map(data, ~ sd(.x[[dv]])),
      se = map(data, ~ sd(.x[[dv]]) / sqrt(nrow(.x))),
      n = map(data, ~ nrow(.x))
    ) %>%
    unnest(c(var, sd, se, n)) %>%
    unnest_wider(ci) %>%
    {
      if (remove_data) {
        select(., -data)
      } else {
        .
      }
    } %>%
    arrange(across(grouping_labels))
}

# assumes data_object is nested into a data column
perform_aov <- function(data_object, formula_aov) {

  response.label <- all.vars(formula_aov)[1]
  predictor.label <- all.vars(formula_aov)[2]
  predictor <- data_object$data %>% pluck(1) %>% .[[predictor.label]]

  data_object %>%
    dplyr::mutate(
      aov = map(data, ~ aov(formula_aov, data = .x)),
      lm = map(data, ~ lm(formula_aov, data = .x)),
      glanced = map(lm, broom::glance), # aov doesn't yield "statistic"
      tidied = map(aov, broom::tidy),
      # means per DV group
      means = map(data, ~.x %>% 
                    group_by(!!sym(predictor.label)) %>% 
                    summarise(mean = mean(!!sym(response.label)))),
      se = map(data, ~ sd(.x[[response.label]]) / sqrt(nrow(.x)))
    ) %>%
    {
      if (class(predictor) == "factor") {
        # posthoc scheffe shows which mean differences are significant
        dplyr::mutate(
          ., # TRICKY: LHS essential for inline pipe
          scheffe = map(aov, ~ DescTools::ScheffeTest(.x)) # only on factor
        )
      } else {
        .
      }
    }
}

test_non_parametric <- function(data_object, formula_nonparam) {

  require(DescTools)
  require(rstatix)

  response <- formula_nonparam %>% all.vars() %>% .[1]
  group <- formula_nonparam %>% all.vars() %>% .[2]

  data_object %>%
    dplyr::mutate(
      # Kruskal-Wallis test: non-parametric alternative to one-way ANOVA
      # uses sample medians instead of means
      kruskaled = map(data,
        ~ kruskal.test(formula_nonparam, data = .x) %>%
          broom::glance(.) %>%
          dplyr::rename(df = parameter)),
      # wilcox does not correct for multiple comparisons with pooled variance
      wilcoxed = map(data, ~ pairwise.wilcox.test(x = response, g = group)),
      # dunn better than wilcox
      # remove DescTools::DunnTest bec no df and precision limited to p < 2e-16
      # dunned = map(data, ~ DescTools::DunnTest(formula_nonparam, data = .x)),
      dunned = map(data, ~ rstatix::dunn_test(formula_nonparam, data = .x))
    )
}


test_smirnov <- function(data_object, score_var = "wins", cum_fun = "pnorm") {

  data_object %>%
    dplyr::mutate(
      # Kolmogorov-Smirnov test
      smirnoved = map(data, ~ .x[[score_var]] %>%
                        ks.test(., y = cum_fun, mean = mean(.), sd = sd(.)) %>%
                        broom::glance(.))
    )
}


test_normality <- function(data_object, formula) {

  response <- all.vars(formula)[1]

  data_object %>%
    test_smirnov(score_var = response) %>%
    dplyr::mutate(
      # Shapiro-Wilk test: preferably on residuals than DV
      # Source: https://psychometroscar.com/2018/07/11/normality-residuals-or-dependent-variable/
      shapiroed = map(aov, ~ .x %>%
                        residuals %>%
                        shapiro.test() %>%
                        broom::glance(.)),
      shapiroed.dv = map(data, ~ .x[[response]] %>%
                         shapiro.test() %>%
                         broom::glance(.)),

      # Welch test is a form of ANOVA that allows for heterogeneity
      welched = map(
        data, ~ oneway.test(formula, data = .x, var.equal = FALSE) %>%
          broom::glance(.))
    )
}


test_homogeneity <- function(data_object, formula) {

  require(onewaytests) # # Brown-Forsythe Test
  require(DescTools)

  predictor.label <- all.vars(formula)[2]
  predictor <- data_object$data %>% pluck(1) %>% .[[predictor.label]]

  data_object %>%
    dplyr::mutate(
      # Bartlett Test of Homogeneity of Variances BUT only one-way ANOVA!
      bartletted = map(data, ~ bartlett.test(formula, data = .x) %>%
                         broom::glance(.)),

      # Fligner-Killeen Test of Homogeneity of Variances
      flignered = map(data, ~ fligner.test(formula, data = .x) %>%
                        broom::glance(.)),

    ) %>%
    {
      if (class(predictor) == "factor") {
        dplyr::mutate(
          # very tricky: for inline conditions, need explicit LHS!
          .,
          # Levene Test: homogeneious if p > 0.05, works only on factor
          levened = map(data, ~ DescTools::LeveneTest(formula, data = .x)),

          # Brown-Forsythe Test
          brownforsythe = map(data, ~ bf.test(formula, data = .x)),
          brownforsythed = map(
            brownforsythe,
            ~ tibble(statistic = .x$statistic, p.value = .x$p.value)
          )
        )

      } else {
        .
      }
    }



}

test_independence <- function(data_object, model_label = "aov") {

  require(DescTools)
  model <- rlang::sym(model_label)
  data_object %>%
    dplyr::mutate(
      # independent samples test
      durbined = map(!!model, ~ DescTools::DurbinWatsonTest(.x) %>%
                     broom::glance(.))
    )
}


create_plots_lm  <- function(data_object, model_label = "aov") {

  require(gglm)
  model <- rlang::sym(model_label)
  data_object %>%
    dplyr::mutate(
      plot.residuals = map(!!model, ~ ggplot(data = .x) + stat_fitted_resid() ),
      plot.qq = imap(!!model, ~ ggplot(data = .x) +
                      stat_normal_qq(alpha = 0.2) +
                      labs(
                        title = paste(.y),
                        x = "", y = ""
                      )
                    ),
      plot.leverage = map(!!model, ~ ggplot(data = .x) + stat_resid_leverage() ),
      hist.residuals  = map(!!model, ~ ggplot(data = .x) + stat_resid_hist() ),
      plot.scale = map(!!model, ~ ggplot(data = .x) + stat_scale_location() ),
    )
}


print_stats <- function(data_set,
                        stat_type,
                        grouping = NULL,
                        save_label = "",
                        format = "html",
                        kable = FALSE,
                        latex = FALSE,
                        digits = 2,
                        ...) {

  require(dplyr)

  result.table <- data_set %>%
    purrr::when(
      stat_type == "glanced" ~ {
        
        unnest(., c(glanced, se)) %>%
          select(grouping,
                 r.squared, adj.r.squared,
                 se, df,
                 F = statistic, p.value # glanced
          )
        
      },
      stat_type == "tidied" ~ { # same as glanced
        
        unnest(., tidied) %>%
          select(term, grouping, F = statistic, p.value)
        
      },
      stat_type == "levened" ~ {

        unnest(., levened) %>%
          select(grouping, F.levene = `F value`, p.levene = `Pr(>F)`) %>%
          filter(!is.na(F.levene))

      },
      stat_type == "shapiroed" ~ {

        unnest(., shapiroed) %>%
          select(grouping, W = statistic, p.shapiro = p.value)

      },
      stat_type == "kruskaled" ~ {

        unnest(., kruskaled) %>%
          select(grouping, K = statistic, p.kruskal = p.value)

      },
      stat_type == "dunned" ~ {

        unnest(., dunned) %>%
          mutate(`eta-squared` = statistic^2 / (n1 + n2)) %>%
          select(grouping, group1, group2,
                 z = statistic, `eta-squared`,
                 p, p.adj, p.adj.signif
                 ) %>%
          unite("groups", c(group1, group2), sep = "~")

      },
      stat_type == "durbined" ~ {

        unnest(., durbined) %>%
          select(grouping, autocorrelation,
                 dw = statistic, p.durbin = p.value)

      },
      TRUE ~ {
        # covers bartletted, flignered, brownforsythed
        unnest(., stat_type, names_repair = "minimal") %>%
          select(grouping, statistic, p.value)
      }
    ) %>%
    {
      if (!is.null(grouping)) {

        dplyr::arrange(., across(grouping))
      } else {
        .
      }
    }

  if (kable) {

    result.table %>%
      convert_kable(., digits = digits) %>%
      {
        if (save_label != "") {

          cat(., file = paste0(save_label, ".", format))

        } else {
          .
        }
      }
  }

  if (latex) {

    result.table %>%
      convert_latex(., digits = digits) %>%
      {
        if (save_label != "") {

          require(stargazer)
          stargazer(., out = paste0(save_label, ".tex"))

        } else {
          .
        }
      }
  }

  result.table

}

convert_kable <- function(data, digits = 4, format = "html", ...) {

  require(knitr)
  require(kableExtra)

  data %>%
    knitr::kable(format = format, digits = digits) %>%
    kableExtra::kable_styling(bootstrap_options = c("bordered", "hover")) %>% print

}

convert_latex <- function(data, digits = 4, ...) {

  require(xtable)
  data %>%
    xtable(digits = digits, ...) %>% print

}

analyze_aov <- function(
  data_object, nesting_labels, formula_aov, score_var = "score",
  test_independence = TRUE,
  test_homogeneity = TRUE,
  test_normality = TRUE,
  create_plots_lm = TRUE
) {

  data_object %>%
    nest(data = -nesting_labels) %>%
    perform_aov(formula_aov) %>%
    {
      if (test_independence) {
        test_independence(.)
      } else {
        .
      }
    } %>%
    {
      if (test_homogeneity) {
        test_homogeneity(., formula_aov)
      } else {
        .
      }
    } %>%
    {
      if (test_normality) {
        test_normality(., formula_aov)
      } else {
        .
      }
    } %>%
    {
      if (create_plots_lm) {
        create_plots_lm(.)
      } else {
        .
      }
    }
}

analyze_non_parametric <- function(
  data_object, nesting_labels, formula_nonparam,
  test_homogeneity = TRUE
) {

  data_object %>%
    nest(data = -nesting_labels) %>%
    test_non_parametric(formula_nonparam) %>%
    {
      if (test_homogeneity) {
        test_homogeneity(., formula_nonparam)
      } else {
        .
      }
    }
}

paired_ttest <- function(
  data_post, data_pre,
  meta.type = "SMCC",
  vars = "vb",
  simple = TRUE) {

  n <- data_post %>% nrow()

  tt <- map2(data_post, data_pre,
             ~ t.test(.x, .y, paired = TRUE) %>%
               .[c("estimate", "p.value", "stderr")] %>%
               as_tibble %>%
               dplyr::rename(.,
                             m.diff = estimate,
                             se.diff = stderr,
                             p.diff = p.value
               ) %>%
               mutate(
                 d.diff = m.diff / (sqrt(n) * se.diff),
                 sd.pre = sd(.y),
                 sd.post = sd(.x),
                 r = cor(.x, .y),
                 # Ross & DeShon (2002): correlated sds
                 d.rm = (m.diff * sqrt(2*(1-r)) )/
                   (sd.pre^2 + sd.post^2 - 2*r*sd.pre*sd.post),
                 # Ross & DeShon (2002): prior sd
                 # d.sd.pre = (estimate * sqrt(2*(1-r)) )/ sd.pre,
                 d.sd.pre = m.diff/ sd.pre,
                 # change score standardization Wolfgang
                 d.mf = metafor::escalc(
                   measure = meta.type,
                   x1i = !!.x, x2i = !!.y,
                   m1i = mean(!!.x), m2i = mean(!!.y),
                   sd1i = sd(!!.x), sd2i = sd(!!.y),
                   ni = nrow(!!data_post),
                   ri = cor(!!.x, !!.y)
                 ) %>%
                   summary() %>%
                   select(yi, pval)
                 ,
                 m1 = mean(!!.y), m2 = mean(!!.x)
               )
  ) %>%
    imap_dfr( ~.x, .id = "item") %>%
    select(., item, m1, m2, m.diff, se.diff,
           sd.pre, sd.post, r,
           d.diff, p.diff, d.rm, d.sd.pre, d.mf)

  if (vars == "vb") {
    tt <- tt %>%
      filter(item %in% vb.perceptions)
  }

  if (simple) {
    tt <- tt %>%
      select(item, m1, m2, m.diff, se.diff,
             d.diff, p.diff, d.mf)
  }

  tt %>%
    select(., -d.mf) %>%
    bind_cols(tt[["d.mf"]]) %>%
    dplyr::rename(d.smcc = yi, p.smcc = pval)

}
agilebean/machinelearningtools documentation built on May 2, 2023, 7:38 p.m.